In [1]:
from tensorflow import keras
from tensorflow.keras import layers
import pathlib
from tensorflow.keras.utils import image_dataset_from_directory

import pandas as pd
import pathlib
from pathlib import Path

import numpy as np
import pandas as pd

# plotting modules
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import plotly as plotly
plotly.offline.init_notebook_mode()

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf
from keras.utils import to_categorical
from keras.models import load_model

import plotly.graph_objects as go
from tensorflow.keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, precision_recall_curve, ConfusionMatrixDisplay
In [183]:
tf.config.list_physical_devices('GPU')
Out[183]:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
In [184]:
# turn off gpu logging
tf.debugging.set_log_device_placement(False)

Framing the Problem¶

Fetching Dataset from multiple sources¶

In [185]:
data_folder = pathlib.Path("../../../../../Downloads/hispathology")
In [2]:
data_folder = pathlib.Path("./Downloads/Datasets")
In [ ]:
# Histogram
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.histplot(data, kde=True)
plt.title('Histogram')

# Q-Q Plot
plt.subplot(1, 2, 2)
stats.probplot(data, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.show()
In [186]:
len(list(data_folder.glob('*')))
Out[186]:
280
In [187]:
list(data_folder.glob('*'))[:10]
Out[187]:
[WindowsPath('../../../../../Downloads/hispathology/10253'),
 WindowsPath('../../../../../Downloads/hispathology/10254'),
 WindowsPath('../../../../../Downloads/hispathology/10255'),
 WindowsPath('../../../../../Downloads/hispathology/10256'),
 WindowsPath('../../../../../Downloads/hispathology/10257'),
 WindowsPath('../../../../../Downloads/hispathology/10258'),
 WindowsPath('../../../../../Downloads/hispathology/10259'),
 WindowsPath('../../../../../Downloads/hispathology/10260'),
 WindowsPath('../../../../../Downloads/hispathology/10261'),
 WindowsPath('../../../../../Downloads/hispathology/10262')]
In [188]:
patient_data_folder = data_folder / 'IDC_regular_ps50_idx5'
In [189]:
len(list(patient_data_folder.glob('*')))
Out[189]:
279
In [190]:
patient_folders = [x for x in patient_data_folder.glob('*') if x.is_dir()]
patient_folders = [x.name for x in patient_folders]
patient_folders[:10]
Out[190]:
['10253',
 '10254',
 '10255',
 '10256',
 '10257',
 '10258',
 '10259',
 '10260',
 '10261',
 '10262']
In [ ]:
 
In [191]:
total_images = 0
for patient_folder in patient_folders:
    patient_folder_path = patient_data_folder / patient_folder
    subfolders = ['0', '1']  # List of subfolders you're interested in
    for subfolder in subfolders:
        subfolder_path = patient_folder_path / subfolder
        images = list(subfolder_path.glob('*'))
        total_images += len(images)
        print(f'Patient folder {patient_folder}/{subfolder} contains {len(images)} images.')
Patient folder 10253/0 contains 479 images.
Patient folder 10253/1 contains 70 images.
Patient folder 10254/0 contains 772 images.
Patient folder 10254/1 contains 76 images.
Patient folder 10255/0 contains 181 images.
Patient folder 10255/1 contains 91 images.
Patient folder 10256/0 contains 351 images.
Patient folder 10256/1 contains 117 images.
Patient folder 10257/0 contains 427 images.
Patient folder 10257/1 contains 208 images.
Patient folder 10258/0 contains 422 images.
Patient folder 10258/1 contains 108 images.
Patient folder 10259/0 contains 1434 images.
Patient folder 10259/1 contains 31 images.
Patient folder 10260/0 contains 928 images.
Patient folder 10260/1 contains 361 images.
Patient folder 10261/0 contains 590 images.
Patient folder 10261/1 contains 56 images.
Patient folder 10262/0 contains 1053 images.
Patient folder 10262/1 contains 754 images.
Patient folder 10264/0 contains 617 images.
Patient folder 10264/1 contains 587 images.
Patient folder 10268/0 contains 2086 images.
Patient folder 10268/1 contains 23 images.
Patient folder 10269/0 contains 904 images.
Patient folder 10269/1 contains 250 images.
Patient folder 10272/0 contains 2150 images.
Patient folder 10272/1 contains 25 images.
Patient folder 10273/0 contains 811 images.
Patient folder 10273/1 contains 1211 images.
Patient folder 10274/0 contains 659 images.
Patient folder 10274/1 contains 219 images.
Patient folder 10275/0 contains 297 images.
Patient folder 10275/1 contains 760 images.
Patient folder 10276/0 contains 591 images.
Patient folder 10276/1 contains 348 images.
Patient folder 10277/0 contains 785 images.
Patient folder 10277/1 contains 170 images.
Patient folder 10278/0 contains 1068 images.
Patient folder 10278/1 contains 91 images.
Patient folder 10279/0 contains 1267 images.
Patient folder 10279/1 contains 427 images.
Patient folder 10282/0 contains 1835 images.
Patient folder 10282/1 contains 198 images.
Patient folder 10285/0 contains 1011 images.
Patient folder 10285/1 contains 222 images.
Patient folder 10286/0 contains 458 images.
Patient folder 10286/1 contains 162 images.
Patient folder 10288/0 contains 2231 images.
Patient folder 10288/1 contains 47 images.
Patient folder 10290/0 contains 1891 images.
Patient folder 10290/1 contains 140 images.
Patient folder 10291/0 contains 999 images.
Patient folder 10291/1 contains 213 images.
Patient folder 10292/0 contains 998 images.
Patient folder 10292/1 contains 487 images.
Patient folder 10293/0 contains 649 images.
Patient folder 10293/1 contains 221 images.
Patient folder 10295/0 contains 761 images.
Patient folder 10295/1 contains 134 images.
Patient folder 10299/0 contains 759 images.
Patient folder 10299/1 contains 1347 images.
Patient folder 10300/0 contains 1464 images.
Patient folder 10300/1 contains 29 images.
Patient folder 10301/0 contains 1074 images.
Patient folder 10301/1 contains 342 images.
Patient folder 10302/0 contains 598 images.
Patient folder 10302/1 contains 1309 images.
Patient folder 10303/0 contains 579 images.
Patient folder 10303/1 contains 773 images.
Patient folder 10304/0 contains 779 images.
Patient folder 10304/1 contains 101 images.
Patient folder 10305/0 contains 1802 images.
Patient folder 10305/1 contains 19 images.
Patient folder 10306/0 contains 751 images.
Patient folder 10306/1 contains 273 images.
Patient folder 10307/0 contains 915 images.
Patient folder 10307/1 contains 80 images.
Patient folder 10308/0 contains 1383 images.
Patient folder 10308/1 contains 895 images.
Patient folder 12241/0 contains 37 images.
Patient folder 12241/1 contains 115 images.
Patient folder 12242/0 contains 668 images.
Patient folder 12242/1 contains 429 images.
Patient folder 12626/0 contains 1088 images.
Patient folder 12626/1 contains 254 images.
Patient folder 12748/0 contains 168 images.
Patient folder 12748/1 contains 198 images.
Patient folder 12749/0 contains 1199 images.
Patient folder 12749/1 contains 563 images.
Patient folder 12750/0 contains 1413 images.
Patient folder 12750/1 contains 21 images.
Patient folder 12751/0 contains 849 images.
Patient folder 12751/1 contains 967 images.
Patient folder 12752/0 contains 464 images.
Patient folder 12752/1 contains 635 images.
Patient folder 12810/0 contains 914 images.
Patient folder 12810/1 contains 252 images.
Patient folder 12811/0 contains 125 images.
Patient folder 12811/1 contains 126 images.
Patient folder 12817/0 contains 362 images.
Patient folder 12817/1 contains 572 images.
Patient folder 12818/0 contains 666 images.
Patient folder 12818/1 contains 945 images.
Patient folder 12819/0 contains 1404 images.
Patient folder 12819/1 contains 223 images.
Patient folder 12820/0 contains 753 images.
Patient folder 12820/1 contains 369 images.
Patient folder 12821/0 contains 1066 images.
Patient folder 12821/1 contains 319 images.
Patient folder 12822/0 contains 490 images.
Patient folder 12822/1 contains 271 images.
Patient folder 12823/0 contains 556 images.
Patient folder 12823/1 contains 447 images.
Patient folder 12824/0 contains 597 images.
Patient folder 12824/1 contains 110 images.
Patient folder 12826/0 contains 963 images.
Patient folder 12826/1 contains 174 images.
Patient folder 12867/0 contains 851 images.
Patient folder 12867/1 contains 575 images.
Patient folder 12868/0 contains 500 images.
Patient folder 12868/1 contains 361 images.
Patient folder 12869/0 contains 778 images.
Patient folder 12869/1 contains 18 images.
Patient folder 12870/0 contains 788 images.
Patient folder 12870/1 contains 41 images.
Patient folder 12871/0 contains 146 images.
Patient folder 12871/1 contains 36 images.
Patient folder 12872/0 contains 711 images.
Patient folder 12872/1 contains 69 images.
Patient folder 12873/0 contains 49 images.
Patient folder 12873/1 contains 232 images.
Patient folder 12875/0 contains 331 images.
Patient folder 12875/1 contains 43 images.
Patient folder 12876/0 contains 50 images.
Patient folder 12876/1 contains 105 images.
Patient folder 12877/0 contains 272 images.
Patient folder 12877/1 contains 33 images.
Patient folder 12878/0 contains 1289 images.
Patient folder 12878/1 contains 185 images.
Patient folder 12879/0 contains 272 images.
Patient folder 12879/1 contains 144 images.
Patient folder 12880/0 contains 788 images.
Patient folder 12880/1 contains 1147 images.
Patient folder 12881/0 contains 115 images.
Patient folder 12881/1 contains 158 images.
Patient folder 12882/0 contains 238 images.
Patient folder 12882/1 contains 154 images.
Patient folder 12883/0 contains 276 images.
Patient folder 12883/1 contains 73 images.
Patient folder 12884/0 contains 533 images.
Patient folder 12884/1 contains 236 images.
Patient folder 12886/0 contains 240 images.
Patient folder 12886/1 contains 287 images.
Patient folder 12890/0 contains 1313 images.
Patient folder 12890/1 contains 158 images.
Patient folder 12891/0 contains 442 images.
Patient folder 12891/1 contains 172 images.
Patient folder 12892/0 contains 133 images.
Patient folder 12892/1 contains 93 images.
Patient folder 12893/0 contains 216 images.
Patient folder 12893/1 contains 482 images.
Patient folder 12894/0 contains 1066 images.
Patient folder 12894/1 contains 650 images.
Patient folder 12895/0 contains 939 images.
Patient folder 12895/1 contains 741 images.
Patient folder 12896/0 contains 424 images.
Patient folder 12896/1 contains 83 images.
Patient folder 12897/0 contains 565 images.
Patient folder 12897/1 contains 296 images.
Patient folder 12898/0 contains 372 images.
Patient folder 12898/1 contains 208 images.
Patient folder 12900/0 contains 573 images.
Patient folder 12900/1 contains 450 images.
Patient folder 12901/0 contains 578 images.
Patient folder 12901/1 contains 230 images.
Patient folder 12905/0 contains 1193 images.
Patient folder 12905/1 contains 21 images.
Patient folder 12906/0 contains 816 images.
Patient folder 12906/1 contains 887 images.
Patient folder 12907/0 contains 546 images.
Patient folder 12907/1 contains 504 images.
Patient folder 12908/0 contains 778 images.
Patient folder 12908/1 contains 262 images.
Patient folder 12909/0 contains 283 images.
Patient folder 12909/1 contains 514 images.
Patient folder 12910/0 contains 1496 images.
Patient folder 12910/1 contains 222 images.
Patient folder 12911/0 contains 1041 images.
Patient folder 12911/1 contains 201 images.
Patient folder 12929/0 contains 90 images.
Patient folder 12929/1 contains 70 images.
Patient folder 12930/0 contains 835 images.
Patient folder 12930/1 contains 165 images.
Patient folder 12931/0 contains 477 images.
Patient folder 12931/1 contains 130 images.
Patient folder 12932/0 contains 433 images.
Patient folder 12932/1 contains 304 images.
Patient folder 12933/0 contains 125 images.
Patient folder 12933/1 contains 30 images.
Patient folder 12934/0 contains 1500 images.
Patient folder 12934/1 contains 504 images.
Patient folder 12935/0 contains 611 images.
Patient folder 12935/1 contains 615 images.
Patient folder 12947/0 contains 378 images.
Patient folder 12947/1 contains 452 images.
Patient folder 12948/0 contains 80 images.
Patient folder 12948/1 contains 87 images.
Patient folder 12949/0 contains 344 images.
Patient folder 12949/1 contains 464 images.
Patient folder 12951/0 contains 808 images.
Patient folder 12951/1 contains 330 images.
Patient folder 12954/0 contains 1945 images.
Patient folder 12954/1 contains 64 images.
Patient folder 12955/0 contains 809 images.
Patient folder 12955/1 contains 253 images.
Patient folder 13018/0 contains 175 images.
Patient folder 13018/1 contains 128 images.
Patient folder 13019/0 contains 1069 images.
Patient folder 13019/1 contains 441 images.
Patient folder 13020/0 contains 336 images.
Patient folder 13020/1 contains 50 images.
Patient folder 13021/0 contains 1089 images.
Patient folder 13021/1 contains 108 images.
Patient folder 13022/0 contains 1056 images.
Patient folder 13022/1 contains 286 images.
Patient folder 13023/0 contains 112 images.
Patient folder 13023/1 contains 115 images.
Patient folder 13024/0 contains 624 images.
Patient folder 13024/1 contains 186 images.
Patient folder 13025/0 contains 465 images.
Patient folder 13025/1 contains 296 images.
Patient folder 13106/0 contains 979 images.
Patient folder 13106/1 contains 155 images.
Patient folder 13400/0 contains 1299 images.
Patient folder 13400/1 contains 64 images.
Patient folder 13401/0 contains 323 images.
Patient folder 13401/1 contains 244 images.
Patient folder 13402/0 contains 292 images.
Patient folder 13402/1 contains 519 images.
Patient folder 13403/0 contains 201 images.
Patient folder 13403/1 contains 111 images.
Patient folder 13404/0 contains 379 images.
Patient folder 13404/1 contains 178 images.
Patient folder 13458/0 contains 394 images.
Patient folder 13458/1 contains 49 images.
Patient folder 13459/0 contains 802 images.
Patient folder 13459/1 contains 224 images.
Patient folder 13460/0 contains 623 images.
Patient folder 13460/1 contains 46 images.
Patient folder 13461/0 contains 594 images.
Patient folder 13461/1 contains 70 images.
Patient folder 13462/0 contains 1028 images.
Patient folder 13462/1 contains 726 images.
Patient folder 13591/0 contains 907 images.
Patient folder 13591/1 contains 128 images.
Patient folder 13613/0 contains 827 images.
Patient folder 13613/1 contains 630 images.
Patient folder 13616/0 contains 656 images.
Patient folder 13616/1 contains 701 images.
Patient folder 13617/0 contains 299 images.
Patient folder 13617/1 contains 56 images.
Patient folder 13666/0 contains 377 images.
Patient folder 13666/1 contains 30 images.
Patient folder 13687/0 contains 303 images.
Patient folder 13687/1 contains 151 images.
Patient folder 13688/0 contains 215 images.
Patient folder 13688/1 contains 127 images.
Patient folder 13689/0 contains 510 images.
Patient folder 13689/1 contains 76 images.
Patient folder 13691/0 contains 927 images.
Patient folder 13691/1 contains 264 images.
Patient folder 13692/0 contains 272 images.
Patient folder 13692/1 contains 335 images.
Patient folder 13693/0 contains 1935 images.
Patient folder 13693/1 contains 460 images.
Patient folder 13694/0 contains 360 images.
Patient folder 13694/1 contains 870 images.
Patient folder 13916/0 contains 1268 images.
Patient folder 13916/1 contains 365 images.
Patient folder 14078/0 contains 100 images.
Patient folder 14078/1 contains 121 images.
Patient folder 14079/0 contains 435 images.
Patient folder 14079/1 contains 455 images.
Patient folder 14081/0 contains 117 images.
Patient folder 14081/1 contains 209 images.
Patient folder 14082/0 contains 281 images.
Patient folder 14082/1 contains 197 images.
Patient folder 14153/0 contains 579 images.
Patient folder 14153/1 contains 210 images.
Patient folder 14154/0 contains 691 images.
Patient folder 14154/1 contains 829 images.
Patient folder 14155/0 contains 671 images.
Patient folder 14155/1 contains 1206 images.
Patient folder 14156/0 contains 1201 images.
Patient folder 14156/1 contains 197 images.
Patient folder 14157/0 contains 990 images.
Patient folder 14157/1 contains 488 images.
Patient folder 14188/0 contains 586 images.
Patient folder 14188/1 contains 123 images.
Patient folder 14189/0 contains 581 images.
Patient folder 14189/1 contains 421 images.
Patient folder 14190/0 contains 458 images.
Patient folder 14190/1 contains 491 images.
Patient folder 14191/0 contains 723 images.
Patient folder 14191/1 contains 617 images.
Patient folder 14192/0 contains 837 images.
Patient folder 14192/1 contains 195 images.
Patient folder 14209/0 contains 33 images.
Patient folder 14209/1 contains 309 images.
Patient folder 14210/0 contains 469 images.
Patient folder 14210/1 contains 104 images.
Patient folder 14211/0 contains 1287 images.
Patient folder 14211/1 contains 809 images.
Patient folder 14212/0 contains 167 images.
Patient folder 14212/1 contains 44 images.
Patient folder 14213/0 contains 169 images.
Patient folder 14213/1 contains 253 images.
Patient folder 14304/0 contains 410 images.
Patient folder 14304/1 contains 432 images.
Patient folder 14305/0 contains 714 images.
Patient folder 14305/1 contains 272 images.
Patient folder 14306/0 contains 264 images.
Patient folder 14306/1 contains 167 images.
Patient folder 14321/0 contains 426 images.
Patient folder 14321/1 contains 195 images.
Patient folder 15471/0 contains 448 images.
Patient folder 15471/1 contains 86 images.
Patient folder 15472/0 contains 1490 images.
Patient folder 15472/1 contains 214 images.
Patient folder 15473/0 contains 553 images.
Patient folder 15473/1 contains 885 images.
Patient folder 15510/0 contains 705 images.
Patient folder 15510/1 contains 356 images.
Patient folder 15512/0 contains 79 images.
Patient folder 15512/1 contains 143 images.
Patient folder 15513/0 contains 815 images.
Patient folder 15513/1 contains 54 images.
Patient folder 15514/0 contains 197 images.
Patient folder 15514/1 contains 441 images.
Patient folder 15515/0 contains 1051 images.
Patient folder 15515/1 contains 111 images.
Patient folder 15516/0 contains 1016 images.
Patient folder 15516/1 contains 275 images.
Patient folder 15632/0 contains 373 images.
Patient folder 15632/1 contains 120 images.
Patient folder 15633/0 contains 114 images.
Patient folder 15633/1 contains 337 images.
Patient folder 15634/0 contains 439 images.
Patient folder 15634/1 contains 370 images.
Patient folder 15839/0 contains 105 images.
Patient folder 15839/1 contains 134 images.
Patient folder 15840/0 contains 862 images.
Patient folder 15840/1 contains 244 images.
Patient folder 15902/0 contains 706 images.
Patient folder 15902/1 contains 461 images.
Patient folder 15903/0 contains 418 images.
Patient folder 15903/1 contains 621 images.
Patient folder 16014/0 contains 497 images.
Patient folder 16014/1 contains 209 images.
Patient folder 16085/0 contains 1913 images.
Patient folder 16085/1 contains 24 images.
Patient folder 16165/0 contains 937 images.
Patient folder 16165/1 contains 1174 images.
Patient folder 16166/0 contains 615 images.
Patient folder 16166/1 contains 675 images.
Patient folder 16167/0 contains 96 images.
Patient folder 16167/1 contains 96 images.
Patient folder 16531/0 contains 191 images.
Patient folder 16531/1 contains 58 images.
Patient folder 16532/0 contains 339 images.
Patient folder 16532/1 contains 128 images.
Patient folder 16533/0 contains 240 images.
Patient folder 16533/1 contains 127 images.
Patient folder 16534/0 contains 21 images.
Patient folder 16534/1 contains 42 images.
Patient folder 16550/0 contains 2115 images.
Patient folder 16550/1 contains 187 images.
Patient folder 16551/0 contains 1899 images.
Patient folder 16551/1 contains 284 images.
Patient folder 16552/0 contains 150 images.
Patient folder 16552/1 contains 37 images.
Patient folder 16553/0 contains 327 images.
Patient folder 16553/1 contains 353 images.
Patient folder 16554/0 contains 269 images.
Patient folder 16554/1 contains 448 images.
Patient folder 16555/0 contains 315 images.
Patient folder 16555/1 contains 85 images.
Patient folder 16568/0 contains 545 images.
Patient folder 16568/1 contains 283 images.
Patient folder 16569/0 contains 302 images.
Patient folder 16569/1 contains 35 images.
Patient folder 16570/0 contains 375 images.
Patient folder 16570/1 contains 542 images.
Patient folder 16895/0 contains 115 images.
Patient folder 16895/1 contains 36 images.
Patient folder 16896/0 contains 1017 images.
Patient folder 16896/1 contains 110 images.
Patient folder 8863/0 contains 772 images.
Patient folder 8863/1 contains 207 images.
Patient folder 8864/0 contains 805 images.
Patient folder 8864/1 contains 328 images.
Patient folder 8865/0 contains 657 images.
Patient folder 8865/1 contains 55 images.
Patient folder 8867/0 contains 1480 images.
Patient folder 8867/1 contains 162 images.
Patient folder 8913/0 contains 873 images.
Patient folder 8913/1 contains 82 images.
Patient folder 8914/0 contains 978 images.
Patient folder 8914/1 contains 75 images.
Patient folder 8916/0 contains 60 images.
Patient folder 8916/1 contains 111 images.
Patient folder 8917/0 contains 578 images.
Patient folder 8917/1 contains 397 images.
Patient folder 8918/0 contains 1421 images.
Patient folder 8918/1 contains 120 images.
Patient folder 8950/0 contains 420 images.
Patient folder 8950/1 contains 190 images.
Patient folder 8951/0 contains 433 images.
Patient folder 8951/1 contains 180 images.
Patient folder 8955/0 contains 314 images.
Patient folder 8955/1 contains 181 images.
Patient folder 8956/0 contains 1485 images.
Patient folder 8956/1 contains 340 images.
Patient folder 8957/0 contains 28 images.
Patient folder 8957/1 contains 83 images.
Patient folder 8959/0 contains 152 images.
Patient folder 8959/1 contains 204 images.
Patient folder 8974/0 contains 1372 images.
Patient folder 8974/1 contains 369 images.
Patient folder 8975/0 contains 1379 images.
Patient folder 8975/1 contains 833 images.
Patient folder 8980/0 contains 487 images.
Patient folder 8980/1 contains 209 images.
Patient folder 8984/0 contains 962 images.
Patient folder 8984/1 contains 156 images.
Patient folder 9022/0 contains 418 images.
Patient folder 9022/1 contains 99 images.
Patient folder 9023/0 contains 583 images.
Patient folder 9023/1 contains 288 images.
Patient folder 9029/0 contains 1497 images.
Patient folder 9029/1 contains 137 images.
Patient folder 9035/0 contains 185 images.
Patient folder 9035/1 contains 51 images.
Patient folder 9036/0 contains 1276 images.
Patient folder 9036/1 contains 30 images.
Patient folder 9037/0 contains 924 images.
Patient folder 9037/1 contains 188 images.
Patient folder 9041/0 contains 857 images.
Patient folder 9041/1 contains 178 images.
Patient folder 9043/0 contains 276 images.
Patient folder 9043/1 contains 560 images.
Patient folder 9044/0 contains 112 images.
Patient folder 9044/1 contains 46 images.
Patient folder 9073/0 contains 771 images.
Patient folder 9073/1 contains 63 images.
Patient folder 9075/0 contains 1420 images.
Patient folder 9075/1 contains 361 images.
Patient folder 9076/0 contains 832 images.
Patient folder 9076/1 contains 159 images.
Patient folder 9077/0 contains 360 images.
Patient folder 9077/1 contains 1263 images.
Patient folder 9078/0 contains 1602 images.
Patient folder 9078/1 contains 186 images.
Patient folder 9081/0 contains 681 images.
Patient folder 9081/1 contains 180 images.
Patient folder 9083/0 contains 373 images.
Patient folder 9083/1 contains 198 images.
Patient folder 9123/0 contains 1427 images.
Patient folder 9123/1 contains 161 images.
Patient folder 9124/0 contains 175 images.
Patient folder 9124/1 contains 290 images.
Patient folder 9125/0 contains 369 images.
Patient folder 9125/1 contains 239 images.
Patient folder 9126/0 contains 1147 images.
Patient folder 9126/1 contains 447 images.
Patient folder 9135/0 contains 604 images.
Patient folder 9135/1 contains 111 images.
Patient folder 9173/0 contains 1020 images.
Patient folder 9173/1 contains 485 images.
Patient folder 9174/0 contains 190 images.
Patient folder 9174/1 contains 29 images.
Patient folder 9175/0 contains 108 images.
Patient folder 9175/1 contains 10 images.
Patient folder 9176/0 contains 636 images.
Patient folder 9176/1 contains 409 images.
Patient folder 9177/0 contains 842 images.
Patient folder 9177/1 contains 261 images.
Patient folder 9178/0 contains 1283 images.
Patient folder 9178/1 contains 160 images.
Patient folder 9181/0 contains 915 images.
Patient folder 9181/1 contains 161 images.
Patient folder 9225/0 contains 1454 images.
Patient folder 9225/1 contains 89 images.
Patient folder 9226/0 contains 817 images.
Patient folder 9226/1 contains 421 images.
Patient folder 9227/0 contains 687 images.
Patient folder 9227/1 contains 127 images.
Patient folder 9228/0 contains 412 images.
Patient folder 9228/1 contains 71 images.
Patient folder 9250/0 contains 636 images.
Patient folder 9250/1 contains 515 images.
Patient folder 9254/0 contains 999 images.
Patient folder 9254/1 contains 173 images.
Patient folder 9255/0 contains 838 images.
Patient folder 9255/1 contains 388 images.
Patient folder 9256/0 contains 489 images.
Patient folder 9256/1 contains 442 images.
Patient folder 9257/0 contains 1001 images.
Patient folder 9257/1 contains 201 images.
Patient folder 9258/0 contains 264 images.
Patient folder 9258/1 contains 334 images.
Patient folder 9259/0 contains 1174 images.
Patient folder 9259/1 contains 225 images.
Patient folder 9260/0 contains 236 images.
Patient folder 9260/1 contains 126 images.
Patient folder 9261/0 contains 521 images.
Patient folder 9261/1 contains 446 images.
Patient folder 9262/0 contains 14 images.
Patient folder 9262/1 contains 80 images.
Patient folder 9265/0 contains 1636 images.
Patient folder 9265/1 contains 41 images.
Patient folder 9266/0 contains 1119 images.
Patient folder 9266/1 contains 52 images.
Patient folder 9267/0 contains 337 images.
Patient folder 9267/1 contains 321 images.
Patient folder 9290/0 contains 1368 images.
Patient folder 9290/1 contains 174 images.
Patient folder 9291/0 contains 733 images.
Patient folder 9291/1 contains 96 images.
Patient folder 9319/0 contains 385 images.
Patient folder 9319/1 contains 31 images.
Patient folder 9320/0 contains 1453 images.
Patient folder 9320/1 contains 451 images.
Patient folder 9321/0 contains 282 images.
Patient folder 9321/1 contains 30 images.
Patient folder 9322/0 contains 1295 images.
Patient folder 9322/1 contains 167 images.
Patient folder 9323/0 contains 1938 images.
Patient folder 9323/1 contains 278 images.
Patient folder 9324/0 contains 720 images.
Patient folder 9324/1 contains 322 images.
Patient folder 9325/0 contains 1060 images.
Patient folder 9325/1 contains 68 images.
Patient folder 9344/0 contains 225 images.
Patient folder 9344/1 contains 310 images.
Patient folder 9345/0 contains 554 images.
Patient folder 9345/1 contains 631 images.
Patient folder 9346/0 contains 634 images.
Patient folder 9346/1 contains 727 images.
Patient folder 9347/0 contains 359 images.
Patient folder 9347/1 contains 51 images.
Patient folder 9381/0 contains 1198 images.
Patient folder 9381/1 contains 128 images.
Patient folder 9382/0 contains 1306 images.
Patient folder 9382/1 contains 346 images.
Patient folder 9383/0 contains 494 images.
Patient folder 9383/1 contains 70 images.
In [192]:
print(f'Total number of images in all subfolders is {total_images}.')
Total number of images in all subfolders is 277524.
In [193]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread  # or from PIL import Image if you prefer
In [194]:
patient_folders = os.listdir(patient_data_folder)  # List of patient folder names

# Initialize an empty list to collect data
data_list = []

# Iterate over patient folders and their subfolders
for patient_id in patient_folders:
    patient_path = patient_data_folder / patient_id
    for target in ['0', '1']:
        class_path = patient_path / target
        image_files = os.listdir(class_path)
        for image_file in image_files:
            image_path = class_path / image_file
            # Append a new dict to the list
            data_list.append({
                "patient_id": patient_id,
                "path": str(image_path),
                "target": int(target)
            })

# Create the DataFrame from the list of dicts
dataset = pd.DataFrame(data_list)

dataset.head()
Out[194]:
patient_id path target
0 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
1 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
2 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
3 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
4 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
In [195]:
dataset.shape
Out[195]:
(277524, 3)

EDA¶

In [196]:
dataset.target.value_counts()
Out[196]:
target
0    198738
1     78786
Name: count, dtype: int64
In [197]:
import plotly.graph_objects as go

# Calculate the percentage of cancerous patches for each patient
cancer_perc = dataset.groupby("patient_id")['target'].value_counts(normalize=True).unstack()

# Number of patches per patient
patches_per_patient = dataset.groupby("patient_id").size()

# Plot 1: Histogram of the number of patches per patient
fig1 = go.Figure(go.Histogram(x=patches_per_patient, nbinsx=30, marker_color="steelblue"))
fig1.update_layout(title_text="How many patches do we have per patient?", xaxis_title="Number of Patches", yaxis_title="Frequency", height=600, width=800)
fig1.update_traces(marker_line_width=1, marker_line_color="black", opacity=0.8)
fig1.show()

# Plot 2: Histogram of the percentage of cancerous patches per patient
# Ensure there is a '1' column for cancerous patches; if not, create it with 0 values
if 1 not in cancer_perc.columns:
    cancer_perc[1] = 0
fig2 = go.Figure(go.Histogram(x=cancer_perc[1], nbinsx=30, marker_color="mediumseagreen"))
fig2.update_layout(title_text="How much percentage of an image is covered by IDC?", xaxis_title="% of Patches with IDC", yaxis_title="Frequency", height=600, width=800)
fig2.update_traces(marker_line_width=1, marker_line_color="black", opacity=0.8)
fig2.show()

# Plot 3: Count plot of non-cancerous vs. cancerous patches
fig3 = go.Figure(go.Histogram(x=dataset['target'], nbinsx=2, marker_color=["darkorchid", "darkorange"]))
fig3.update_layout(title_text="How many patches show IDC?", xaxis_title="No (0) vs Yes (1)", yaxis_title="Count", height=600, width=800)
fig3.update_traces(marker_line_width=1, marker_line_color="black", opacity=0.8)
fig3.show()

Examining the Patches¶

In [198]:
healthy = np.random.choice(dataset[dataset.target==0].index.values, size=50, replace=False)
non_healthy = np.random.choice(dataset[dataset.target==1].index.values, size=50, replace=False)

Healthy Patches

In [199]:
fig, ax = plt.subplots(5, 10, figsize=(20, 10))

for n in range(5):
    for m in range(10):
        idx = healthy[m + 10*n]  # Index of the sample to display
        image_path = dataset.loc[idx, "path"]  # Get the path of the image
        image = imread(image_path)  # Load the image
        ax[n, m].imshow(image)  # Display the image
        ax[n, m].axis('off')  # Hide the axes

plt.show()
No description has been provided for this image

Cancer Patches¶

In [200]:
fig, ax = plt.subplots(5, 10, figsize=(20, 10))

for n in range(5):
    for m in range(10):
        idx = non_healthy[m + 10*n]  # Index of the sample to display
        image_path = dataset.loc[idx, "path"]  # Get the path of the image
        image = imread(image_path)  # Load the image
        ax[n, m].imshow(image)  # Display the image
        ax[n, m].axis('off')  # Hide the axes

plt.show()
No description has been provided for this image

So far we have looked at each patch of the breast, now let us attempt to visualize the entire breast tissue.

In [201]:
import pandas as pd
import numpy as np

def extract_coords(df):
    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    df_copy = df.copy()
    
    # Extract the x and y coordinates from the file paths
    coord = df_copy['path'].str.rsplit("_", n=4, expand=True)
    coord = coord.drop([0, 1, 4], axis=1)  # Drop unused parts
    coord = coord.rename({2: "x", 3: "y"}, axis=1)  # Rename columns to 'x' and 'y'
    coord['x'] = coord['x'].str.replace("x", "", case=False).astype(int)  # Convert x values to integers
    coord['y'] = coord['y'].str.replace("y", "", case=False).astype(int)  # Convert y values to integers
    
    # Merge the coordinates back into the original DataFrame using .loc to avoid SettingWithCopyWarning
    df_copy.loc[:, 'x'] = coord['x']
    df_copy.loc[:, 'y'] = coord['y']
    
    return df_copy

def get_patient_dataframe(patient_id, dataset):
    # Filter the dataset for the given patient_id
    patient_df = dataset[dataset['patient_id'] == patient_id].copy()
    
    # Extract coordinates and targets
    patient_df = extract_coords(patient_df)
    
    return patient_df
In [202]:
get_patient_dataframe("10253", dataset).head()
Out[202]:
patient_id path target x y
0 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0 1001 1001
1 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0 1001 1051
2 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0 1001 1101
3 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0 1001 1151
4 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0 1001 1201

Binary target visualisation per tissue slice

In [203]:
fig, ax = plt.subplots(5, 3, figsize=(20, 27))

# Get unique patient IDs from your dataset
patient_ids = dataset.patient_id.unique()

for n in range(5):
    for m in range(3):
        patient_id = patient_ids[m + 3*n]  # Select patient ID
        example_df = get_patient_dataframe(patient_id, dataset)  # Get the DataFrame for this patient
        
        # Plot scatter plot of x-y coordinates colored by target
        ax[n, m].scatter(example_df.x.values, example_df.y.values, c=example_df.target.values, cmap="coolwarm", s=20)
        ax[n, m].set_title("patient " + patient_id)
        ax[n, m].set_xlabel("y coord")
        ax[n, m].set_ylabel("x coord")
No description has been provided for this image

Insights Sometimes we don't have the full tissue information. It seems that tissue patches have been discarded or lost during preparation. Reading the paper (link!) that seems to be related to this data this could also be part of the preprocessing.

Visualising the breast tissue images Ok, now it's time to go one step deeper with our EDA. Given the coordinates of image patches we could try to reconstruct the whole tissue image (not only the targets).

In [204]:
def visualise_breast_tissue(patient_id, pred_df=None):
    example_df = get_patient_dataframe(patient_id, dataset)
    max_point = [example_df.y.max() - 1, example_df.x.max() - 1]
    grid = 255 * np.ones(shape=(max_point[0] + 50, max_point[1] + 50, 3)).astype(np.uint8)
    mask = 255 * np.ones(shape=(max_point[0] + 50, max_point[1] + 50, 3)).astype(np.uint8)
    

    if pred_df is not None:
        patient_df = pred_df[pred_df['patient_id'] == patient_id].copy()
    mask_proba = np.zeros(shape=(max_point[0] + 50, max_point[1] + 50, 1)).astype(float)

    broken_patches = []
    for n in range(len(example_df)):
        try:
            image = imread(example_df.path.values[n])
            # Convert the image from normalized floats to uint8
            image = (image * 255).astype(np.uint8)

            target = example_df.target.values[n]
            
            x_coord, y_coord = int(example_df.x.values[n]), int(example_df.y.values[n])
            x_start, y_start = x_coord - 1, y_coord - 1
            x_end, y_end = x_start + 50, y_start + 50

            grid[y_start:y_end, x_start:x_end] = image
           
            if target == 1:
                mask[y_start:y_end, x_start:x_end, 0] = 250
                mask[y_start:y_end, x_start:x_end, 1] = 0
                mask[y_start:y_end, x_start:x_end, 2] = 0
            if pred_df is not None:
                proba = patient_df[(patient_df['x'] == x_coord) & (patient_df['y'] == y_coord)]['proba']
                mask_proba[y_start:y_end, x_start:x_end, 0] = float(proba)

        except ValueError:
            broken_patches.append(example_df.iloc[n]['path'])

    return grid, mask, broken_patches, mask_proba
In [205]:
patient_id = '10262'
grid, mask, broken_patches, mask_proba = visualise_breast_tissue(patient_id)

fig, ax = plt.subplots(1,2,figsize=(20,10))
ax[0].imshow(grid, alpha=0.9)
ax[1].imshow(mask, alpha=0.8)
ax[1].imshow(grid, alpha=0.7)
ax[0].grid(False)
ax[1].grid(False)
for m in range(2):
    ax[m].set_xlabel("y-coord")
    ax[m].set_ylabel("y-coord")
ax[0].set_title("Breast tissue slice of patient: " + patient_id)
ax[1].set_title("Cancer tissue colored red \n of patient: " + patient_id);
No description has been provided for this image

Splitting the Data¶

In [206]:
dataset
Out[206]:
patient_id path target
0 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
1 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
2 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
3 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
4 10253 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 0
... ... ... ...
277519 9383 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 1
277520 9383 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 1
277521 9383 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 1
277522 9383 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 1
277523 9383 ..\..\..\..\..\Downloads\hispathology\IDC_regu... 1

277524 rows × 3 columns

In [207]:
# Separate the dataset into two based on the target value
class_0_df = dataset[dataset['target'] == 0]
class_1_df = dataset[dataset['target'] == 1]

# Sample 2500 instances from each class
class_0_sample = class_0_df.sample(78786, random_state=42)
class_1_sample = class_1_df.sample(78786, random_state=42)

# Concatenate the two samples to create a balanced smaller dataset
small_dataset = pd.concat([class_0_sample, class_1_sample])

# Shuffle the small_dataset to mix class_0 and class_1 samples
small_dataset = small_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
In [208]:
X = small_dataset.drop(columns=['target'])
y = small_dataset['target'].astype(str)
y = pd.DataFrame(y)
In [209]:
# show the rgb values of the images
In [210]:
from sklearn.model_selection import train_test_split

# First, split into train and temp (temp will be further split into validation and test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Now split the temp set into validation and test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
In [211]:
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
In [212]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# train_datagen = ImageDataGenerator(
#     # rescale=1./255,
#     horizontal_flip=True,
#     vertical_flip=True,
#     # Add other transformations here
# )

train_datagen = ImageDataGenerator(
    rescale=1./255,
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    rotation_range=20,  # randomly rotate images in the range (degrees, 0 to 180)
    width_shift_range=0.2,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.2,  # randomly shift images vertically (fraction of total height)
    horizontal_flip=True,  # randomly flip images
    vertical_flip=True
)

val_datagen = ImageDataGenerator(rescale=1./255)
In [213]:
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col="path",
    y_col="target",
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="categorical",
    target_size=(180,180))

valid_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    x_col="path",
    y_col="target",
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="categorical",
    target_size=(180,180))

test_generator = val_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col="path",
    y_col="target",
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="categorical",
    target_size=(180,180))
Found 110300 validated image filenames belonging to 2 classes.
Found 23636 validated image filenames belonging to 2 classes.
Found 23636 validated image filenames belonging to 2 classes.
In [276]:
test_generator = val_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col="path",
    y_col="target",
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode="categorical",
    target_size=(180,180))
Found 23636 validated image filenames belonging to 2 classes.
In [214]:
# Fetch the first batch of images and labels
images, labels = next(train_generator)

# Inspect the first image in the batch
first_image = images[0]

# Check the maximum and minimum values
max_value = first_image.max()
min_value = first_image.min()

print(f"Maximum pixel value: {max_value}")
print(f"Minimum pixel value: {min_value}")

images
Maximum pixel value: 0.9764706492424011
Minimum pixel value: 0.21176472306251526
Out[214]:
array([[[[0.8313726 , 0.5411765 , 0.63529414],
         [0.8313726 , 0.5411765 , 0.63529414],
         [0.8313726 , 0.5411765 , 0.63529414],
         ...,
         [0.94117653, 0.9450981 , 0.9568628 ],
         [0.94406515, 0.9450981 , 0.9568628 ],
         [0.9469896 , 0.9450981 , 0.9568628 ]],

        [[0.8313726 , 0.5411765 , 0.63529414],
         [0.8313726 , 0.5411765 , 0.63529414],
         [0.8313726 , 0.5411765 , 0.63529414],
         ...,
         [0.9525361 , 0.9450981 , 0.9568628 ],
         [0.95294124, 0.9450981 , 0.9568628 ],
         [0.95294124, 0.9450981 , 0.9568628 ]],

        [[0.8313726 , 0.5411765 , 0.63529414],
         [0.8313726 , 0.5411765 , 0.63529414],
         [0.8313726 , 0.5411765 , 0.63529414],
         ...,
         [0.95294124, 0.9450981 , 0.9568628 ],
         [0.95294124, 0.9450981 , 0.9568628 ],
         [0.95294124, 0.9450981 , 0.9568628 ]],

        ...,

        [[0.92549026, 0.79215693, 0.85098046],
         [0.92549026, 0.79215693, 0.85098046],
         [0.92549026, 0.79215693, 0.85098046],
         ...,
         [0.91372555, 0.6431373 , 0.74509805],
         [0.91341007, 0.65165544, 0.7495149 ],
         [0.909804  , 0.7490196 , 0.8000001 ]],

        [[0.92549026, 0.79215693, 0.85098046],
         [0.92549026, 0.79215693, 0.85098046],
         [0.92549026, 0.79215693, 0.85098046],
         ...,
         [0.91372555, 0.6431373 , 0.74509805],
         [0.91372555, 0.6431373 , 0.74509805],
         [0.9105864 , 0.72789437, 0.7890462 ]],

        [[0.92549026, 0.79215693, 0.85098046],
         [0.92549026, 0.79215693, 0.85098046],
         [0.92549026, 0.79215693, 0.85098046],
         ...,
         [0.91372555, 0.6431373 , 0.74509805],
         [0.91372555, 0.6431373 , 0.74509805],
         [0.9115612 , 0.7015744 , 0.7753988 ]]],


       [[[0.7019608 , 0.28235295, 0.44705886],
         [0.64974254, 0.2927966 , 0.47142738],
         [0.6431373 , 0.29411766, 0.47450984],
         ...,
         [0.8431373 , 0.8117648 , 0.8431373 ],
         [0.8431373 , 0.8117648 , 0.8431373 ],
         [0.8431373 , 0.8117648 , 0.8431373 ]],

        [[0.7019608 , 0.28235295, 0.44705886],
         [0.6571902 , 0.2913071 , 0.46795177],
         [0.6431373 , 0.29411766, 0.47450984],
         ...,
         [0.8431373 , 0.8117648 , 0.8431373 ],
         [0.8431373 , 0.8117648 , 0.8431373 ],
         [0.8431373 , 0.8117648 , 0.8431373 ]],

        [[0.7019608 , 0.28235295, 0.44705886],
         [0.6646379 , 0.28981754, 0.4644762 ],
         [0.6431373 , 0.29411766, 0.47450984],
         ...,
         [0.8431373 , 0.8117648 , 0.8431373 ],
         [0.8431373 , 0.8117648 , 0.8431373 ],
         [0.8431373 , 0.8117648 , 0.8431373 ]],

        ...,

        [[0.8352942 , 0.654902  , 0.7568628 ],
         [0.8371669 , 0.6680114 , 0.77371776],
         [0.83921576, 0.68235296, 0.79215693],
         ...,
         [0.7725491 , 0.4666667 , 0.6117647 ],
         [0.7725491 , 0.4666667 , 0.6117647 ],
         [0.7731029 , 0.46168223, 0.6070572 ]],

        [[0.8352942 , 0.654902  , 0.7568628 ],
         [0.83667046, 0.6645359 , 0.76924914],
         [0.83921576, 0.68235296, 0.79215693],
         ...,
         [0.77889687, 0.40953654, 0.55780846],
         [0.7798899 , 0.40059927, 0.5493677 ],
         [0.7803922 , 0.39607847, 0.54509807]],

        [[0.8629043 , 0.73006296, 0.8059475 ],
         [0.86725533, 0.74141955, 0.81563336],
         [0.85437727, 0.7000415 , 0.7946838 ],
         ...,
         [0.7803922 , 0.39607847, 0.54509807],
         [0.7803922 , 0.39607847, 0.54509807],
         [0.7803922 , 0.39607847, 0.54509807]]],


       [[[0.8862746 , 0.6901961 , 0.75294125],
         [0.8862746 , 0.6901961 , 0.75294125],
         [0.8862746 , 0.6901961 , 0.75294125],
         ...,
         [0.8980393 , 0.6313726 , 0.72156864],
         [0.8980393 , 0.6510014 , 0.7355892 ],
         [0.8980393 , 0.65882355, 0.7411765 ]],

        [[0.8862746 , 0.6901961 , 0.75294125],
         [0.8862746 , 0.6901961 , 0.75294125],
         [0.8862746 , 0.6901961 , 0.75294125],
         ...,
         [0.8980393 , 0.6313726 , 0.72156864],
         [0.8980393 , 0.6549626 , 0.7384187 ],
         [0.8980393 , 0.65882355, 0.7411765 ]],

        [[0.8862746 , 0.6901961 , 0.75294125],
         [0.8862746 , 0.6901961 , 0.75294125],
         [0.8862746 , 0.6901961 , 0.75294125],
         ...,
         [0.8980393 , 0.63176024, 0.72184557],
         [0.8980393 , 0.65882355, 0.7411765 ],
         [0.8980393 , 0.65882355, 0.7411765 ]],

        ...,

        [[0.8705883 , 0.5647059 , 0.6392157 ],
         [0.8705883 , 0.5647059 , 0.6392157 ],
         [0.8705883 , 0.5647059 , 0.6392157 ],
         ...,
         [0.8980393 , 0.5411765 , 0.6392157 ],
         [0.8980393 , 0.5411765 , 0.6392157 ],
         [0.8980393 , 0.5411765 , 0.6392157 ]],

        [[0.8705883 , 0.5647059 , 0.6392157 ],
         [0.8705883 , 0.5647059 , 0.6392157 ],
         [0.8705883 , 0.5647059 , 0.6392157 ],
         ...,
         [0.8980393 , 0.5411765 , 0.6392157 ],
         [0.8980393 , 0.5411765 , 0.6392157 ],
         [0.8980393 , 0.5411765 , 0.6392157 ]],

        [[0.86211634, 0.5675299 , 0.64957035],
         [0.8672093 , 0.56583226, 0.64334553],
         [0.8705883 , 0.5647059 , 0.6392157 ],
         ...,
         [0.8980393 , 0.5411765 , 0.6392157 ],
         [0.8980393 , 0.5411765 , 0.6392157 ],
         [0.8980393 , 0.5411765 , 0.6392157 ]]],


       ...,


       [[[0.5553125 , 0.39394477, 0.54509807],
         [0.5686275 , 0.43921572, 0.54509807],
         [0.5686275 , 0.43921572, 0.54509807],
         ...,
         [0.7058824 , 0.40000004, 0.5764706 ],
         [0.7058824 , 0.40000004, 0.5764706 ],
         [0.7058824 , 0.40000004, 0.5764706 ]],

        [[0.56081676, 0.4126594 , 0.54509807],
         [0.5686275 , 0.43921572, 0.54509807],
         [0.5686275 , 0.43921572, 0.54509807],
         ...,
         [0.7058824 , 0.40000004, 0.5764706 ],
         [0.7058824 , 0.40000004, 0.5764706 ],
         [0.7058824 , 0.40000004, 0.5764706 ]],

        [[0.57442456, 0.43658486, 0.5547858 ],
         [0.5686275 , 0.43921572, 0.54509807],
         [0.5686275 , 0.43921572, 0.54509807],
         ...,
         [0.7058824 , 0.40000004, 0.5764706 ],
         [0.7058824 , 0.40000004, 0.5764706 ],
         [0.7058824 , 0.40000004, 0.5764706 ]],

        ...,

        [[0.7686275 , 0.5058824 , 0.6627451 ],
         [0.7686275 , 0.5058824 , 0.6627451 ],
         [0.6012408 , 0.40763363, 0.56813526],
         ...,
         [0.49812543, 0.37160367, 0.5073867 ],
         [0.42767027, 0.34298125, 0.4798651 ],
         [0.4039216 , 0.33333334, 0.47058827]],

        [[0.7686275 , 0.5058824 , 0.6627451 ],
         [0.7237397 , 0.4795352 , 0.6373738 ],
         [0.5882353 , 0.40000004, 0.56078434],
         ...,
         [0.654902  , 0.43529415, 0.5686275 ],
         [0.654902  , 0.43529415, 0.5686275 ],
         [0.5981036 , 0.41221976, 0.5464406 ]],

        [[0.7686275 , 0.5058824 , 0.6627451 ],
         [0.67310005, 0.44981194, 0.60875136],
         [0.5882353 , 0.40000004, 0.56078434],
         ...,
         [0.654902  , 0.43529415, 0.5686275 ],
         [0.654902  , 0.43529415, 0.5686275 ],
         [0.654902  , 0.43529415, 0.5686275 ]]],


       [[[0.8431373 , 0.59607846, 0.7176471 ],
         [0.8431373 , 0.59607846, 0.7176471 ],
         [0.8431373 , 0.59607846, 0.7176471 ],
         ...,
         [0.8352942 , 0.6       , 0.72156864],
         [0.8352942 , 0.6       , 0.72156864],
         [0.81441814, 0.5194781 , 0.6589405 ]],

        [[0.8431373 , 0.59607846, 0.7176471 ],
         [0.8431373 , 0.59607846, 0.7176471 ],
         [0.8431373 , 0.59607846, 0.7176471 ],
         ...,
         [0.8352942 , 0.6       , 0.72156864],
         [0.8352942 , 0.6       , 0.72156864],
         [0.81191206, 0.50981194, 0.6514223 ]],

        [[0.8431373 , 0.59607846, 0.7176471 ],
         [0.8431373 , 0.59607846, 0.7176471 ],
         [0.8431373 , 0.59607846, 0.7176471 ],
         ...,
         [0.8352942 , 0.6       , 0.72156864],
         [0.8352942 , 0.6       , 0.72156864],
         [0.80940604, 0.50014573, 0.64390427]],

        ...,

        [[0.82088965, 0.57042587, 0.7017123 ],
         [0.8233957 , 0.577228  , 0.7077984 ],
         [0.82590175, 0.58403015, 0.7138846 ],
         ...,
         [0.8965566 , 0.7303076 , 0.8023785 ],
         [0.90196085, 0.74509805, 0.8117648 ],
         [0.90196085, 0.74509805, 0.8117648 ]],

        [[0.8000001 , 0.5137255 , 0.6509804 ],
         [0.8000001 , 0.5137255 , 0.6509804 ],
         [0.8000001 , 0.5137255 , 0.6509804 ],
         ...,
         [0.90196085, 0.74509805, 0.8117648 ],
         [0.90196085, 0.74509805, 0.8117648 ],
         [0.90196085, 0.74509805, 0.8117648 ]],

        [[0.8000001 , 0.5137255 , 0.6509804 ],
         [0.8000001 , 0.5137255 , 0.6509804 ],
         [0.8000001 , 0.5137255 , 0.6509804 ],
         ...,
         [0.9047858 , 0.7572455 , 0.8196747 ],
         [0.90196085, 0.74509805, 0.8117648 ],
         [0.90196085, 0.74509805, 0.8117648 ]]],


       [[[0.8470589 , 0.6901961 , 0.7960785 ],
         [0.8470589 , 0.6901961 , 0.7960785 ],
         [0.84670603, 0.689599  , 0.7956985 ],
         ...,
         [0.6431373 , 0.40784317, 0.59607846],
         [0.6431373 , 0.40784317, 0.59607846],
         [0.6431373 , 0.40784317, 0.59607846]],

        [[0.8470589 , 0.6901961 , 0.7960785 ],
         [0.8470589 , 0.6901961 , 0.7960785 ],
         [0.8470589 , 0.6901961 , 0.7960785 ],
         ...,
         [0.6431373 , 0.40784317, 0.59607846],
         [0.6431373 , 0.40784317, 0.59607846],
         [0.6431373 , 0.40784317, 0.59607846]],

        [[0.8470589 , 0.6901961 , 0.7960785 ],
         [0.8470589 , 0.6901961 , 0.7960785 ],
         [0.8470589 , 0.6901961 , 0.7960785 ],
         ...,
         [0.6431373 , 0.40784317, 0.59607846],
         [0.6431373 , 0.40784317, 0.59607846],
         [0.6431373 , 0.40784317, 0.59607846]],

        ...,

        [[0.7779189 , 0.4222036 , 0.59674317],
         [0.77647066, 0.4156863 , 0.5921569 ],
         [0.77647066, 0.4156863 , 0.5921569 ],
         ...,
         [0.8235295 , 0.5058824 , 0.6627451 ],
         [0.8235295 , 0.5058824 , 0.6627451 ],
         [0.8235295 , 0.5058824 , 0.6627451 ]],

        [[0.77647066, 0.4156863 , 0.5921569 ],
         [0.77647066, 0.4156863 , 0.5921569 ],
         [0.77647066, 0.4156863 , 0.5921569 ],
         ...,
         [0.8235295 , 0.5058824 , 0.6627451 ],
         [0.8235295 , 0.5058824 , 0.6627451 ],
         [0.8235295 , 0.5058824 , 0.6627451 ]],

        [[0.77647066, 0.4156863 , 0.5921569 ],
         [0.77647066, 0.4156863 , 0.5921569 ],
         [0.77647066, 0.4156863 , 0.5921569 ],
         ...,
         [0.8235295 , 0.5058824 , 0.6627451 ],
         [0.8235295 , 0.5058824 , 0.6627451 ],
         [0.8235295 , 0.5058824 , 0.6627451 ]]]], dtype=float32)
In [215]:
# from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=42)
# X_res, y_res = ros.fit_resample(X.reshape(X.shape[0], -1), y)  # Reshape X to 2D for resampling
# X_res = X_res.reshape(-1, 50, 50, 3)  # Reshape X back to its original shape

Modelling¶

In [216]:
# import conv2d, maxpooling2d, flatten, dense, dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(180, 180, 3)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'])
In [217]:
model.summary()
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d_6 (Conv2D)           (None, 178, 178, 32)      896       
                                                                 
 conv2d_7 (Conv2D)           (None, 176, 176, 64)      18496     
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 88, 88, 64)       0         
 2D)                                                             
                                                                 
 dropout_14 (Dropout)        (None, 88, 88, 64)        0         
                                                                 
 flatten_9 (Flatten)         (None, 495616)            0         
                                                                 
 dense_18 (Dense)            (None, 128)               63438976  
                                                                 
 dropout_15 (Dropout)        (None, 128)               0         
                                                                 
 dense_19 (Dense)            (None, 2)                 258       
                                                                 
=================================================================
Total params: 63,458,626
Trainable params: 63,458,626
Non-trainable params: 0
_________________________________________________________________
In [218]:
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath="./models/convnet_from_scratch_with_augmentation.keras",
        save_best_only=True,
        monitor="val_loss")
]

history = model.fit(
    train_generator,
    batch_size=1024,
    epochs=8,
    validation_data=valid_generator,
    callbacks=callbacks)
Epoch 1/8
3447/3447 [==============================] - 643s 186ms/step - loss: 0.5428 - accuracy: 0.7374 - val_loss: 0.4756 - val_accuracy: 0.7891
Epoch 2/8
3447/3447 [==============================] - 598s 173ms/step - loss: 0.4947 - accuracy: 0.7796 - val_loss: 0.4714 - val_accuracy: 0.7899
Epoch 3/8
3447/3447 [==============================] - 611s 177ms/step - loss: 0.4854 - accuracy: 0.7855 - val_loss: 0.4615 - val_accuracy: 0.7969
Epoch 4/8
3447/3447 [==============================] - 615s 178ms/step - loss: 0.4807 - accuracy: 0.7882 - val_loss: 0.4595 - val_accuracy: 0.7976
Epoch 5/8
3447/3447 [==============================] - 599s 174ms/step - loss: 0.4769 - accuracy: 0.7896 - val_loss: 0.4583 - val_accuracy: 0.7979
Epoch 6/8
3447/3447 [==============================] - 592s 172ms/step - loss: 0.4751 - accuracy: 0.7912 - val_loss: 0.4566 - val_accuracy: 0.7995
Epoch 7/8
3447/3447 [==============================] - 601s 174ms/step - loss: 0.4720 - accuracy: 0.7915 - val_loss: 0.4515 - val_accuracy: 0.8001
Epoch 8/8
3447/3447 [==============================] - 665s 193ms/step - loss: 0.4712 - accuracy: 0.7920 - val_loss: 0.4505 - val_accuracy: 0.8002
In [268]:
history_df = pd.DataFrame(history.history)
history_df.insert(0, 'epoch', range(1, len(history_df) + 1))
history_df
Out[268]:
epoch loss accuracy val_loss val_accuracy
0 1 0.542761 0.737353 0.475618 0.789051
1 2 0.494709 0.779556 0.471389 0.789854
2 3 0.485370 0.785494 0.461452 0.796920
3 4 0.480675 0.788196 0.459515 0.797597
4 5 0.476892 0.789637 0.458342 0.797851
5 6 0.475120 0.791160 0.456600 0.799543
6 7 0.472008 0.791469 0.451499 0.800093
7 8 0.471177 0.792004 0.450510 0.800178
In [269]:
# Create a DataFrame from the history object
history_df = pd.DataFrame(history.history)

# Plot the training and validation loss
plt.figure(figsize=(9, 5))
values = history_df['accuracy']
epochs = range(1, len(values) + 1)
plt.plot(epochs, history_df['loss'], 'bo', label='Training loss')
plt.plot(epochs, history_df['val_loss'], 'ro', label='Validation loss')

plt.xlabel('Epochs')
plt.xticks(epochs)
plt.ylabel('Loss')
plt.legend()
plt.title('Training and validation loss')
plt.show()

# Plot the training and validation accuracy
plt.figure(figsize=(9, 5))
plt.plot(epochs, history_df['accuracy'], 'bo', label='Training accuracy')
plt.plot(epochs, history_df['val_accuracy'], 'ro', label='Validation accuracy')

plt.xlabel('Epochs')
plt.xticks(epochs)
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and validation accuracy')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [277]:
# evaluate the model
model.evaluate(test_generator)
739/739 [==============================] - 20s 27ms/step - loss: 0.4561 - accuracy: 0.7946
Out[277]:
[0.45607948303222656, 0.7946352958679199]
In [278]:
# predict the model
y_pred = model.predict(test_generator)

# get the class with the highest probability
y_pred = np.argmax(y_pred, axis=1)

# get the true class
y_true = test_generator.classes

# get the class labels
class_labels = list(test_generator.class_indices.keys())

# get the classification report
print(classification_report(y_true, y_pred, target_names=class_labels))

# get the confusion matrix
cm = confusion_matrix(y_true, y_pred)

# plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')

# get the precision recall curve
precision, recall, _ = precision_recall_curve(y_true, y_pred)

# plot the precision recall curve
plt.figure(figsize=(9, 5))
plt.plot(recall, precision, marker='o', color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# get the f1 score
f1 = f1_score(y_true, y_pred)
print(f'F1 Score: {f1}')

# get the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy}')

# get the precision
precision = precision_score(y_true, y_pred)
print(f'Precision: {precision}')

# get the recall
recall = recall_score(y_true, y_pred)
print(f'Recall: {recall}')

#from the confusion matrix, calculate tn, fp, fn, tp
tn, fp, fn, tp = cm.ravel()
print(f'True Negatives: {tn}')
print(f'False Positives: {fp}')
print(f'False Negatives: {fn}')
print(f'True Positives: {tp}')

# calculate the specificity
specificity = tn / (tn + fp)
print(f'Specificity: {specificity}')
  1/739 [..............................] - ETA: 49s
739/739 [==============================] - 20s 28ms/step
              precision    recall  f1-score   support

           0       0.80      0.78      0.79     11874
           1       0.79      0.81      0.80     11762

    accuracy                           0.79     23636
   macro avg       0.79      0.79      0.79     23636
weighted avg       0.79      0.79      0.79     23636

No description has been provided for this image
No description has been provided for this image
F1 Score: 0.7967166429349192
Accuracy: 0.7946353020815705
Precision: 0.7850775833608452
Recall: 0.8087060023805476
In [271]:
model.evaluate(test_generator)
739/739 [==============================] - 20s 27ms/step - loss: 0.4561 - accuracy: 0.7946
Out[271]:
[0.45607972145080566, 0.7946352958679199]
In [219]:
images, labels = next(train_generator)
In [220]:
# Inspect the first image in the batch
first_image = images[0]

# Check the maximum and minimum values
max_value = first_image.max()
min_value = first_image.min()

print(f"Maximum pixel value: {max_value}")
print(f"Minimum pixel value: {min_value}")
Maximum pixel value: 0.9725490808486938
Minimum pixel value: 0.5333333611488342

VGG 16

In [254]:
conv_base = keras.applications.vgg16.VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(180, 180, 3)
    )
In [255]:
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_16 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 180, 180, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 180, 180, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 90, 90, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 90, 90, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 90, 90, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 45, 45, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 45, 45, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 22, 22, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 22, 22, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 11, 11, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 5, 5, 512)         0         
                                                                 
=================================================================
Total params: 14,714,688
Trainable params: 14,714,688
Non-trainable params: 0
_________________________________________________________________
In [256]:
conv_base.trainable = False
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_16 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 180, 180, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 180, 180, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 90, 90, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 90, 90, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 90, 90, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 45, 45, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 45, 45, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 22, 22, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 22, 22, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 11, 11, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 5, 5, 512)         0         
                                                                 
=================================================================
Total params: 14,714,688
Trainable params: 0
Non-trainable params: 14,714,688
_________________________________________________________________
In [257]:
inputs = keras.Input(shape=(180, 180, 3))
x = keras.applications.vgg16.preprocess_input(inputs)
x = conv_base(x)
x = layers.Dropout(0.25)(x)
x = layers.Flatten()(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(2, activation="softmax")(x)
model_vgg = keras.Model(inputs, outputs)
In [258]:
model_vgg.summary()
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_17 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 tf.__operators__.getitem_9   (None, 180, 180, 3)      0         
 (SlicingOpLambda)                                               
                                                                 
 tf.nn.bias_add_9 (TFOpLambd  (None, 180, 180, 3)      0         
 a)                                                              
                                                                 
 vgg16 (Functional)          (None, 5, 5, 512)         14714688  
                                                                 
 dropout_22 (Dropout)        (None, 5, 5, 512)         0         
                                                                 
 flatten_13 (Flatten)        (None, 12800)             0         
                                                                 
 dense_26 (Dense)            (None, 128)               1638528   
                                                                 
 dropout_23 (Dropout)        (None, 128)               0         
                                                                 
 dense_27 (Dense)            (None, 2)                 258       
                                                                 
=================================================================
Total params: 16,353,474
Trainable params: 1,638,786
Non-trainable params: 14,714,688
_________________________________________________________________
In [259]:
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_16 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 180, 180, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 180, 180, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 90, 90, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 90, 90, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 90, 90, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 45, 45, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 45, 45, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 22, 22, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 22, 22, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 11, 11, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 5, 5, 512)         0         
                                                                 
=================================================================
Total params: 14,714,688
Trainable params: 0
Non-trainable params: 14,714,688
_________________________________________________________________
In [260]:
conv_base.trainable = True
for layer in conv_base.layers[:-4]:
    layer.trainable = False
In [261]:
model_vgg.summary()
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_17 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 tf.__operators__.getitem_9   (None, 180, 180, 3)      0         
 (SlicingOpLambda)                                               
                                                                 
 tf.nn.bias_add_9 (TFOpLambd  (None, 180, 180, 3)      0         
 a)                                                              
                                                                 
 vgg16 (Functional)          (None, 5, 5, 512)         14714688  
                                                                 
 dropout_22 (Dropout)        (None, 5, 5, 512)         0         
                                                                 
 flatten_13 (Flatten)        (None, 12800)             0         
                                                                 
 dense_26 (Dense)            (None, 128)               1638528   
                                                                 
 dropout_23 (Dropout)        (None, 128)               0         
                                                                 
 dense_27 (Dense)            (None, 2)                 258       
                                                                 
=================================================================
Total params: 16,353,474
Trainable params: 8,718,210
Non-trainable params: 7,635,264
_________________________________________________________________
In [262]:
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_16 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 180, 180, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 180, 180, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 90, 90, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 90, 90, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 90, 90, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 45, 45, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 45, 45, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 45, 45, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 22, 22, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 22, 22, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 22, 22, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 11, 11, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 11, 11, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 5, 5, 512)         0         
                                                                 
=================================================================
Total params: 14,714,688
Trainable params: 7,079,424
Non-trainable params: 7,635,264
_________________________________________________________________
In [263]:
model_vgg.summary()
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_17 (InputLayer)       [(None, 180, 180, 3)]     0         
                                                                 
 tf.__operators__.getitem_9   (None, 180, 180, 3)      0         
 (SlicingOpLambda)                                               
                                                                 
 tf.nn.bias_add_9 (TFOpLambd  (None, 180, 180, 3)      0         
 a)                                                              
                                                                 
 vgg16 (Functional)          (None, 5, 5, 512)         14714688  
                                                                 
 dropout_22 (Dropout)        (None, 5, 5, 512)         0         
                                                                 
 flatten_13 (Flatten)        (None, 12800)             0         
                                                                 
 dense_26 (Dense)            (None, 128)               1638528   
                                                                 
 dropout_23 (Dropout)        (None, 128)               0         
                                                                 
 dense_27 (Dense)            (None, 2)                 258       
                                                                 
=================================================================
Total params: 16,353,474
Trainable params: 8,718,210
Non-trainable params: 7,635,264
_________________________________________________________________
In [264]:
model_vgg.compile(loss="categorical_crossentropy",
              optimizer=keras.optimizers.Adadelta(),
              metrics=["accuracy"])

callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath="./models/finetune-vgg16.keras",
        save_best_only=True,
        monitor="val_loss")
]
history_vgg = model_vgg.fit(
    train_generator,
    epochs=30,
    batch_size=1024,
    validation_data=valid_generator,
    callbacks=callbacks)
Epoch 1/30
  14/3447 [..............................] - ETA: 7:59 - loss: 1.5627 - accuracy: 0.5424
3447/3447 [==============================] - 649s 188ms/step - loss: 0.7689 - accuracy: 0.5350 - val_loss: 0.6613 - val_accuracy: 0.6381
Epoch 2/30
3447/3447 [==============================] - 695s 202ms/step - loss: 0.6585 - accuracy: 0.6152 - val_loss: 0.6267 - val_accuracy: 0.6638
Epoch 3/30
3447/3447 [==============================] - 683s 198ms/step - loss: 0.6332 - accuracy: 0.6489 - val_loss: 0.6100 - val_accuracy: 0.6713
Epoch 4/30
3447/3447 [==============================] - 707s 205ms/step - loss: 0.6150 - accuracy: 0.6681 - val_loss: 0.5885 - val_accuracy: 0.6942
Epoch 5/30
3447/3447 [==============================] - 824s 239ms/step - loss: 0.6021 - accuracy: 0.6804 - val_loss: 0.5785 - val_accuracy: 0.7001
Epoch 6/30
3447/3447 [==============================] - 934s 271ms/step - loss: 0.5940 - accuracy: 0.6878 - val_loss: 0.5761 - val_accuracy: 0.7022
Epoch 7/30
3447/3447 [==============================] - 936s 271ms/step - loss: 0.5872 - accuracy: 0.6934 - val_loss: 0.5835 - val_accuracy: 0.6949
Epoch 8/30
3447/3447 [==============================] - 887s 257ms/step - loss: 0.5833 - accuracy: 0.6980 - val_loss: 0.5683 - val_accuracy: 0.7088
Epoch 9/30
3447/3447 [==============================] - 888s 258ms/step - loss: 0.5798 - accuracy: 0.7001 - val_loss: 0.5644 - val_accuracy: 0.7121
Epoch 10/30
3447/3447 [==============================] - 884s 257ms/step - loss: 0.5771 - accuracy: 0.7030 - val_loss: 0.6800 - val_accuracy: 0.6266
Epoch 11/30
3447/3447 [==============================] - 894s 259ms/step - loss: 0.5754 - accuracy: 0.7048 - val_loss: 0.5682 - val_accuracy: 0.7065
Epoch 12/30
3447/3447 [==============================] - 917s 266ms/step - loss: 0.5734 - accuracy: 0.7049 - val_loss: 0.5730 - val_accuracy: 0.7043
Epoch 13/30
3447/3447 [==============================] - 910s 264ms/step - loss: 0.5728 - accuracy: 0.7063 - val_loss: 0.5672 - val_accuracy: 0.7089
Epoch 14/30
3447/3447 [==============================] - 902s 262ms/step - loss: 0.5694 - accuracy: 0.7089 - val_loss: 0.5697 - val_accuracy: 0.7063
Epoch 15/30
3447/3447 [==============================] - 797s 231ms/step - loss: 0.5685 - accuracy: 0.7090 - val_loss: 0.5811 - val_accuracy: 0.7005
Epoch 16/30
3447/3447 [==============================] - 745s 216ms/step - loss: 0.5670 - accuracy: 0.7125 - val_loss: 0.5773 - val_accuracy: 0.7003
Epoch 17/30
3447/3447 [==============================] - 739s 214ms/step - loss: 0.5666 - accuracy: 0.7129 - val_loss: 0.5853 - val_accuracy: 0.7016
Epoch 18/30
3447/3447 [==============================] - 749s 217ms/step - loss: 0.5640 - accuracy: 0.7145 - val_loss: 0.5588 - val_accuracy: 0.7150
Epoch 19/30
3447/3447 [==============================] - 740s 215ms/step - loss: 0.5635 - accuracy: 0.7162 - val_loss: 0.5740 - val_accuracy: 0.7048
Epoch 20/30
3447/3447 [==============================] - 742s 215ms/step - loss: 0.5622 - accuracy: 0.7165 - val_loss: 0.5468 - val_accuracy: 0.7265
Epoch 21/30
3447/3447 [==============================] - 747s 217ms/step - loss: 0.5608 - accuracy: 0.7178 - val_loss: 0.5552 - val_accuracy: 0.7203
Epoch 22/30
3447/3447 [==============================] - 741s 215ms/step - loss: 0.5590 - accuracy: 0.7176 - val_loss: 0.5445 - val_accuracy: 0.7284
Epoch 23/30
3447/3447 [==============================] - 739s 214ms/step - loss: 0.5591 - accuracy: 0.7197 - val_loss: 0.5588 - val_accuracy: 0.7180
Epoch 24/30
3447/3447 [==============================] - 751s 218ms/step - loss: 0.5579 - accuracy: 0.7189 - val_loss: 0.6143 - val_accuracy: 0.6788
Epoch 25/30
3447/3447 [==============================] - 741s 215ms/step - loss: 0.5572 - accuracy: 0.7202 - val_loss: 0.5756 - val_accuracy: 0.7035
Epoch 26/30
3447/3447 [==============================] - 737s 214ms/step - loss: 0.5561 - accuracy: 0.7210 - val_loss: 0.5467 - val_accuracy: 0.7253
Epoch 27/30
3447/3447 [==============================] - 745s 216ms/step - loss: 0.5554 - accuracy: 0.7233 - val_loss: 0.5420 - val_accuracy: 0.7315
Epoch 28/30
3447/3447 [==============================] - 743s 215ms/step - loss: 0.5540 - accuracy: 0.7217 - val_loss: 0.5498 - val_accuracy: 0.7263
Epoch 29/30
3447/3447 [==============================] - 751s 218ms/step - loss: 0.5519 - accuracy: 0.7237 - val_loss: 0.5439 - val_accuracy: 0.7306
Epoch 30/30
3447/3447 [==============================] - 714s 207ms/step - loss: 0.5504 - accuracy: 0.7253 - val_loss: 0.5498 - val_accuracy: 0.7255
In [266]:
history_df_vgg = pd.DataFrame(history_vgg.history)
history_df_vgg.insert(0, 'epoch', range(1, len(history_df_vgg) + 1))
history_df_vgg
Out[266]:
epoch loss accuracy val_loss val_accuracy
0 1 0.768852 0.534968 0.661330 0.638137
1 2 0.658532 0.615222 0.626669 0.663818
2 3 0.633195 0.648876 0.610023 0.671264
3 4 0.614975 0.668087 0.588496 0.694195
4 5 0.602072 0.680381 0.578502 0.700118
5 6 0.594014 0.687770 0.576064 0.702192
6 7 0.587167 0.693427 0.583505 0.694872
7 8 0.583271 0.698024 0.568290 0.708834
8 9 0.579839 0.700091 0.564396 0.712092
9 10 0.577093 0.703046 0.680026 0.626587
10 11 0.575370 0.704805 0.568176 0.706549
11 12 0.573445 0.704914 0.572976 0.704307
12 13 0.572816 0.706256 0.567196 0.708876
13 14 0.569372 0.708903 0.569712 0.706253
14 15 0.568541 0.708994 0.581116 0.700499
15 16 0.566963 0.712493 0.577310 0.700330
16 17 0.566581 0.712892 0.585278 0.701599
17 18 0.563980 0.714515 0.558843 0.715011
18 19 0.563463 0.716174 0.574028 0.704815
19 20 0.562204 0.716528 0.546846 0.726477
20 21 0.560795 0.717842 0.555157 0.720257
21 22 0.559017 0.717625 0.544491 0.728423
22 23 0.559097 0.719746 0.558807 0.717973
23 24 0.557906 0.718948 0.614282 0.678837
24 25 0.557239 0.720199 0.575618 0.703461
25 26 0.556065 0.721024 0.546726 0.725334
26 27 0.555357 0.723345 0.541951 0.731511
27 28 0.553954 0.721750 0.549792 0.726307
28 29 0.551855 0.723717 0.543875 0.730580
29 30 0.550425 0.725295 0.549761 0.725461
In [267]:
# Plot the training and validation loss
plt.figure(figsize=(9, 5))
values = history_df_vgg['accuracy']
epochs = range(1, len(values) + 1)
plt.plot(epochs, history_df_vgg['loss'], 'bo', label='Training loss')
plt.plot(epochs, history_df_vgg['val_loss'], 'ro', label='Validation loss')

plt.xlabel('Epochs')
plt.xticks(epochs)
plt.ylabel('Loss')
plt.legend()
plt.title('Training and validation loss')
plt.show()

# Plot the training and validation accuracy
plt.figure(figsize=(9, 5))
plt.plot(epochs, history_df_vgg['accuracy'], 'bo', label='Training accuracy')
plt.plot(epochs, history_df_vgg['val_accuracy'], 'ro', label='Validation accuracy')

plt.xlabel('Epochs')
plt.xticks(epochs)
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and validation accuracy')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [279]:
# predict the model
y_pred_vgg = model_vgg.predict(test_generator)

# get the class with the highest probability
y_pred_vgg = np.argmax(y_pred_vgg, axis=1)

# get the true class
y_true = test_generator.classes

# get the class labels
class_labels = list(test_generator.class_indices.keys())

# get the classification report
print(classification_report(y_true, y_pred_vgg, target_names=class_labels))

# get the confusion matrix
cm_vgg = confusion_matrix(y_true, y_pred_vgg)

# plot the confusion matrix
disp_vgg = ConfusionMatrixDisplay(confusion_matrix=cm_vgg, display_labels=class_labels)
disp_vgg.plot(cmap='Blues')

# get the precision recall curve
precision_vgg, recall_vgg, _ = precision_recall_curve(y_true, y_pred_vgg)

# plot the precision recall curve
plt.figure(figsize=(9, 5))
plt.plot(recall_vgg, precision_vgg, marker='o', color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# get the f1 score
f1_vgg = f1_score(y_true, y_pred_vgg)
print(f'F1 Score: {f1}')

# get the accuracy
accuracy_vgg = accuracy_score(y_true, y_pred_vgg)
print(f'Accuracy: {accuracy_vgg}')

# get the precision
precision_vgg = precision_score(y_true, y_pred_vgg)
print(f'Precision: {precision_vgg}')

# get the recall
recall_vgg = recall_score(y_true, y_pred_vgg)
print(f'Recall: {recall_vgg}')

#from the confusion matrix, calculate tn, fp, fn, tp
tn_vgg, fp_vgg, fn_vgg, tp_vgg = cm.ravel()
print(f'True Negatives: {tn_vgg}')
print(f'False Positives: {fp_vgg}')
print(f'False Negatives: {fn_vgg}')
print(f'True Positives: {tp_vgg}')

# calculate the specificity
specificity_vgg = tn_vgg / (tn_vgg + fp_vgg)
print(f'Specificity: {specificity_vgg}')
739/739 [==============================] - 44s 59ms/step
              precision    recall  f1-score   support

           0       0.78      0.61      0.69     11874
           1       0.68      0.83      0.75     11762

    accuracy                           0.72     23636
   macro avg       0.73      0.72      0.72     23636
weighted avg       0.73      0.72      0.72     23636

No description has been provided for this image
No description has been provided for this image
F1 Score: 0.7967166429349192
Accuracy: 0.7189456760873244
Precision: 0.6778295004516084
Recall: 0.829450773677946
True Negatives: 9270
False Positives: 2604
False Negatives: 2250
True Positives: 9512
Specificity: 0.7806973218797373